Loading libraries:
#install.packages("readr")
library("readr")
#install.packages('data.table')
library(data.table)
library(stringr)
#install.packages('ggplot2')
library(ggplot2)
library(gridExtra)
library(grid)
library(plyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:data.table':
##
## transpose
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages('flextable')
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
##
## compose
library(tibble)
library(ComplexHeatmap)
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(RColorBrewer)
library(ggplotify)
library(here)
## here() starts at /home/natasa/Desktop/master_project/Master-Project-Natasa-Mortvanski
##
## Attaching package: 'here'
## The following object is masked from 'package:plyr':
##
## here
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tidyr 1.3.0
## ✔ lubridate 1.9.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ purrr::compact() masks plyr::compact()
## ✖ flextable::compose() masks purrr::compose()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ dplyr::id() masks plyr::id()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ lubridate::second() masks data.table::second()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Load data:
# Load healthy samples' table
all_healthy <- read.csv(here("01_tidy_data", "AGP_healthy.csv.gz"), header = TRUE, sep = ",")
all_healthy <- dplyr::select(all_healthy, sample_id, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, pielou_evenness, gini_index, strong, simpson, faith_pd, sex, race, host_age, condition)
names(all_healthy)[names(all_healthy) == 'host_age'] <- 'age'
#Load *CDI data from NCBI* data
CDI <- read.csv(here("01_tidy_data", "ncbi_CDI.csv.gz"), header = TRUE, sep = ",")
# Load CDI and donor data from Hospital ClÃnic
hospital_CDI <- read.csv(here("01_tidy_data", "hosp_CDI.csv.gz"), header = TRUE, sep = ",")
hospital_donor <- read.csv(here("01_tidy_data", "hosp_donor.csv.gz"), header = TRUE, sep = ",")
Let’s define vector of names of the alpha diversity metrics that are going to be analysed:
metric <- c("chao1", "margalef", "menhinick", "fisher_alpha", "faith_pd", "gini_index", "strong", "pielou_evenness", "shannon_entropy", "simpson")
Let’s define function for plotting violin plots that we are going to use in the whole analysis:
plot_violin <- function(df, column){
violin <- vector('list', length(metric))
for (i in 1:length(metric)){
mean_line <- df %>% dplyr::group_by(.data[[column]]) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
plot_data <- df %>% dplyr::group_by(.data[[column]]) %>% dplyr::mutate(m = mean(.data[[metric[i]]]))
violin[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(.data[[column]], -m), color = .data[[column]], fill = .data[[column]])) +
geom_violin()+
geom_boxplot(width=0.1, color="grey", alpha=0.2) +
geom_vline(data = mean_line, aes(xintercept = grp_mean, color = .data[[column]]), linetype = "dashed")+
labs(x = metric[i])+
ylab("")+
theme(legend.position="none")
if(metric[i] != "shannon_entropy" & metric[i] != "strong" & metric[i] != "gini_index" & metric[i] != "menhinick"){
violin[[i]] <- violin[[i]] +
scale_x_continuous(trans = 'log10') +
xlab(paste(metric[i], "(log10)", sep = " "))
}
}
return(violin)
}
Function for doing Mann-Whitney-Wilcoxon test:
do_wilcox_test <- function(df, column){
test <- list()
for (i in 1:length(metric)){
test[[i]] <- pairwise.wilcox.test(df[[metric[i]]], df[[column]], p.adjust.method="none") %>%
broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
test[[i]]$p.value <- round(test[[i]]$p.value, digits = 5)
}
tests <- do.call(what = rbind, args = test)
return(tests)
}
First, lets examine difference in alpha diversity before and after FMT in CDI data set:
hospital_CDI$FMT_pre_post <- as.factor(hospital_CDI$FMT_pre_post)
hospital_CDI$FMT_pre_post <- ordered(hospital_CDI$FMT_pre_post, levels = c("pre", "post"))
table1(~ chao1 + margalef + menhinick + fisher_alpha + faith_pd + gini_index + strong + pielou_evenness + shannon_entropy + simpson | FMT_pre_post, data=hospital_CDI)
| pre (N=18) |
post (N=38) |
Overall (N=56) |
|
|---|---|---|---|
| chao1 | |||
| Mean (SD) | 89.3 (36.1) | 133 (61.3) | 119 (57.9) |
| Median [Min, Max] | 88.1 [29.0, 159] | 126 [41.0, 298] | 109 [29.0, 298] |
| margalef | |||
| Mean (SD) | 9.75 (3.92) | 14.1 (5.99) | 12.7 (5.75) |
| Median [Min, Max] | 9.52 [3.25, 17.6] | 13.9 [4.64, 28.2] | 11.6 [3.25, 28.2] |
| menhinick | |||
| Mean (SD) | 1.15 (0.455) | 1.65 (0.695) | 1.49 (0.667) |
| Median [Min, Max] | 1.12 [0.391, 2.06] | 1.63 [0.553, 3.29] | 1.36 [0.391, 3.29] |
| fisher_alpha | |||
| Mean (SD) | 14.6 (6.84) | 22.8 (11.7) | 20.1 (11.0) |
| Median [Min, Max] | 13.9 [4.01, 29.2] | 21.9 [6.01, 52.3] | 17.6 [4.01, 52.3] |
| faith_pd | |||
| Mean (SD) | 13.0 (3.84) | 15.1 (4.82) | 14.4 (4.61) |
| Median [Min, Max] | 12.3 [5.45, 19.8] | 14.3 [7.15, 27.0] | 14.2 [5.45, 27.0] |
| gini_index | |||
| Mean (SD) | 0.997 (0.00156) | 0.996 (0.00200) | 0.996 (0.00197) |
| Median [Min, Max] | 0.997 [0.994, 0.999] | 0.996 [0.991, 0.999] | 0.996 [0.991, 0.999] |
| strong | |||
| Mean (SD) | 0.496 (0.0806) | 0.470 (0.0521) | 0.478 (0.0632) |
| Median [Min, Max] | 0.490 [0.355, 0.660] | 0.473 [0.351, 0.615] | 0.476 [0.351, 0.660] |
| pielou_evenness | |||
| Mean (SD) | 0.815 (0.0685) | 0.848 (0.0410) | 0.837 (0.0531) |
| Median [Min, Max] | 0.829 [0.669, 0.911] | 0.857 [0.698, 0.913] | 0.846 [0.669, 0.913] |
| shannon_entropy | |||
| Mean (SD) | 5.14 (0.828) | 5.78 (0.708) | 5.58 (0.800) |
| Median [Min, Max] | 5.23 [3.92, 6.41] | 5.86 [4.02, 7.04] | 5.68 [3.92, 7.04] |
| simpson | |||
| Mean (SD) | 0.951 (0.0285) | 0.969 (0.0175) | 0.963 (0.0231) |
| Median [Min, Max] | 0.959 [0.892, 0.986] | 0.975 [0.902, 0.990] | 0.971 [0.892, 0.990] |
violin_hospital_2 <- vector('list', length(metric))
violin_hospital_2 <- plot_violin(hospital_CDI, "FMT_pre_post")
grid.arrange(violin_hospital_2[[1]], violin_hospital_2[[2]], violin_hospital_2[[3]], violin_hospital_2[[4]], violin_hospital_2[[5]], violin_hospital_2[[6]], violin_hospital_2[[7]], violin_hospital_2[[8]], violin_hospital_2[[9]], violin_hospital_2[[10]], ncol=3)
Lets see how alpha diversity of CDI data before FMT diverges from stool donor data from Hospital ClÃnic:
hospital_CDI_pre_FMT <- hospital_CDI %>%
filter(FMT_pre_post == "pre")
compare_hospital <- rbind.fill(hospital_donor, hospital_CDI_pre_FMT)
compare_hospital$condition <- as.factor(compare_hospital$condition)
# Sizes of each dataset
table(compare_hospital$condition)
##
## Cdif_pre healthy_donors
## 18 113
table1(~ chao1 + margalef + menhinick + fisher_alpha + faith_pd + gini_index + strong + pielou_evenness + shannon_entropy + simpson | condition, data=compare_hospital)
| Cdif_pre (N=18) |
healthy_donors (N=113) |
Overall (N=131) |
|
|---|---|---|---|
| chao1 | |||
| Mean (SD) | 89.3 (36.1) | 191 (41.1) | 177 (53.5) |
| Median [Min, Max] | 88.1 [29.0, 159] | 189 [103, 300] | 180 [29.0, 300] |
| margalef | |||
| Mean (SD) | 9.75 (3.92) | 19.5 (4.07) | 18.1 (5.25) |
| Median [Min, Max] | 9.52 [3.25, 17.6] | 19.4 [10.6, 29.6] | 18.4 [3.25, 29.6] |
| menhinick | |||
| Mean (SD) | 1.15 (0.455) | 1.54 (0.320) | 1.48 (0.365) |
| Median [Min, Max] | 1.12 [0.391, 2.06] | 1.54 [0.841, 2.34] | 1.50 [0.391, 2.34] |
| fisher_alpha | |||
| Mean (SD) | 14.6 (6.84) | 30.5 (7.55) | 28.3 (9.25) |
| Median [Min, Max] | 13.9 [4.01, 29.2] | 30.3 [14.9, 50.1] | 29.0 [4.01, 50.1] |
| faith_pd | |||
| Mean (SD) | 13.0 (3.84) | 21.7 (3.96) | 20.5 (4.95) |
| Median [Min, Max] | 12.3 [5.45, 19.8] | 20.8 [16.4, 42.0] | 20.5 [5.45, 42.0] |
| gini_index | |||
| Mean (SD) | 0.997 (0.00156) | 0.980 (0.00382) | 0.983 (0.00682) |
| Median [Min, Max] | 0.997 [0.994, 0.999] | 0.981 [0.970, 0.989] | 0.981 [0.970, 0.999] |
| strong | |||
| Mean (SD) | 0.496 (0.0806) | 0.388 (0.0306) | 0.403 (0.0553) |
| Median [Min, Max] | 0.490 [0.355, 0.660] | 0.387 [0.319, 0.453] | 0.395 [0.319, 0.660] |
| pielou_evenness | |||
| Mean (SD) | 0.815 (0.0685) | 0.904 (0.0198) | 0.891 (0.0435) |
| Median [Min, Max] | 0.829 [0.669, 0.911] | 0.907 [0.852, 0.936] | 0.903 [0.669, 0.936] |
| shannon_entropy | |||
| Mean (SD) | 5.14 (0.828) | 6.80 (0.290) | 6.57 (0.699) |
| Median [Min, Max] | 5.23 [3.92, 6.41] | 6.77 [5.87, 7.46] | 6.73 [3.92, 7.46] |
| simpson | |||
| Mean (SD) | 0.951 (0.0285) | 0.987 (0.00389) | 0.982 (0.0165) |
| Median [Min, Max] | 0.959 [0.892, 0.986] | 0.987 [0.970, 0.992] | 0.986 [0.892, 0.992] |
violin_compare <- vector('list', length(metric))
violin_compare <- plot_violin(compare_hospital, "condition")
#violin_compare
grid.arrange(violin_compare[[1]], violin_compare[[2]], violin_compare[[3]], violin_compare[[4]], violin_compare[[5]], violin_compare[[6]], violin_compare[[7]], violin_compare[[8]], violin_compare[[9]], violin_compare[[10]], ncol=3)
test_CDI_healthy <- list()
test_CDI_healthy <- do_wilcox_test(compare_hospital, "condition")
test_CDI_healthy_1 <- test_CDI_healthy %>%
add_column(p.adjusted = round(p.adjust(test_CDI_healthy$p.value, "fdr"), digits=5), .after='p.value') %>%
arrange(p.value, parameter) %>%
flextable() %>%
bold(~ p.value < 0.05, 4) %>%
bold(~ p.adjusted < 0.05, 5) %>%
add_header_lines(values = "Results of the Mann-Whitney-Wilcoxon test for distributions of Hospital ClÃnic's CDI samples vs healthy control")
test_CDI_healthy_1
Results of the Mann-Whitney-Wilcoxon test for distributions of Hospital ClÃnic's CDI samples vs healthy control | ||||
|---|---|---|---|---|
parameter | group1 | group2 | p.value | p.adjusted |
chao1 | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
faith_pd | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
fisher_alpha | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
gini_index | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
margalef | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
pielou_evenness | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
shannon_entropy | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
simpson | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
strong | healthy_donors | Cdif_pre | 0.00000 | 0.00000 |
menhinick | healthy_donors | Cdif_pre | 0.00049 | 0.00049 |
library(ROSE)
## Loaded ROSE 0.0-4
# Let's make training and testing subset of data
#make this example reproducible
set.seed(1)
#use 70% of dataset as training set and 30% as test set
sample <- sample(c(TRUE, FALSE), nrow(compare_hospital), replace=TRUE, prob=c(0.7,0.3))
train <- compare_hospital[sample, ]
test <- compare_hospital[!sample, ]
table(train$condition)
##
## Cdif_pre healthy_donors
## 15 77
table(test$condition)
##
## Cdif_pre healthy_donors
## 3 36
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
richness <- c("chao1", "margalef", "menhinick", "fisher_alpha", "faith_pd")
evenness <- c("gini_index", "strong", "pielou_evenness", "shannon_entropy", "simpson")
results_accuracy_CDI <- data.frame(model = character(0), accuracy = numeric(0))
model_all <- randomForest(condition ~ shannon_entropy + chao1 + menhinick + margalef + fisher_alpha + simpson + gini_index + strong + pielou_evenness + faith_pd, data = train, importance=TRUE)
prediction_CDI_all <- predict(model_all, test)
confusion_matrix <- confusionMatrix(prediction_CDI_all, test$condition)
accuracy_CDI_all <- confusion_matrix$overall["Accuracy"]
for (a in richness){
for (b in evenness){
formula <- as.formula(sprintf("%s ~ %s + %s", "condition", a, b))
model <- randomForest(formula, data = train, importance=TRUE)
# Calculating accuracy
prediction <- predict(model, test)
confusion_matrix <- confusionMatrix(prediction, test$condition)
accuracy <- confusion_matrix$overall["Accuracy"]
new_row <- c(model = sprintf("%s + %s", a, b), accuracy = accuracy)
results_accuracy_CDI <- rbind(results_accuracy_CDI, new_row)
}
}
names(results_accuracy_CDI)[1] <- 'model'
names(results_accuracy_CDI)[2] <- 'accuracy'
results_accuracy_CDI[nrow(results_accuracy_CDI)+1,] <- c("all alpha metrics", accuracy_CDI_all)
results_accuracy_CDI$accuracy <- as.numeric(results_accuracy_CDI$accuracy)
results_accuracy_CDI <- results_accuracy_CDI[order(-results_accuracy_CDI$accuracy),]
results_accuracy_CDI[1:10,] %>%
flextable() %>%
add_header_lines(values = "Accuracy of random forest classifier trained on CDI and healthy datasets in differnet models")
Accuracy of random forest classifier trained on CDI and healthy datasets in differnet models | |
|---|---|
model | accuracy |
chao1 + gini_index | 1.000000 |
margalef + gini_index | 1.000000 |
menhinick + gini_index | 1.000000 |
menhinick + strong | 1.000000 |
menhinick + shannon_entropy | 1.000000 |
fisher_alpha + gini_index | 1.000000 |
faith_pd + gini_index | 1.000000 |
all alpha metrics | 1.000000 |
chao1 + strong | 0.974359 |
menhinick + pielou_evenness | 0.974359 |
set.seed(1)
#use 70% of dataset as healthy population and 30% as healthy samples to test
sample <- sample(c(TRUE, FALSE), nrow(hospital_donor), replace=TRUE, prob=c(0.7,0.3))
healthy_popul <- hospital_donor[sample, ]
healthy_test <- hospital_donor[!sample, ]
nrow(healthy_popul)
## [1] 77
nrow(healthy_test)
## [1] 36
# samples to test
test_samples <- rbind.fill(healthy_test, hospital_CDI)
#test_samples <- rbind.fill(healthy_popul, CDI)
nrow(test_samples)
## [1] 92
library(table1)
CrawfordHowell <- function(case, control){
tval <- (case - mean(control)) / (sd(control)*sqrt((length(control)+1) / length(control)))
degfree <- length(control)-1
pval <- 2*(1-pt(abs(tval), df=degfree)) #two-tailed p-value
result <- data.frame(t = tval, df = degfree, p=pval)
return(result)
}
#table_list <- list()
for (i in 1:length(metric)){
t_statistics <- list()
t_prob <- list()
result <- data.frame()
for (n in 1:nrow(test_samples)){
result <- CrawfordHowell(test_samples[[metric[i]]][n], healthy_popul[[metric[i]]])
t_statistics <- append(t_statistics, result[1])
t_prob <- append(t_prob, result[3])
}
t_test_results <- test_samples[, c("sample_id", "condition")]
t_test_results$metric <- test_samples[[metric[i]]]
t_test_results$t_statistic <- unlist(t_statistics)
t_test_results$t_probability <- unlist(t_prob)
#table_list[[i]] <- t_test_results
print(table1(~ metric + t_probability | condition, data=t_test_results, caption = paste("Results for alpha metric:", metric[i], " ")))
}
## <table class="Rtable1"><caption>Results for alpha metric: chao1 </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>133 (61.3)</td>
## <td>89.3 (36.1)</td>
## <td>191 (44.5)</td>
## <td>147 (63.5)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>126 [41.0, 298]</td>
## <td class='lastrow'>88.1 [29.0, 159]</td>
## <td class='lastrow'>189 [117, 300]</td>
## <td class='lastrow'>147 [29.0, 300]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.228 (0.299)</td>
## <td>0.0589 (0.110)</td>
## <td>0.449 (0.278)</td>
## <td>0.282 (0.301)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0838 [0.000341, 0.978]</td>
## <td class='lastrow'>0.0120 [0.000122, 0.420]</td>
## <td class='lastrow'>0.449 [0.00834, 0.992]</td>
## <td class='lastrow'>0.155 [0.000122, 0.992]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: margalef </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>14.1 (5.99)</td>
## <td>9.75 (3.92)</td>
## <td>19.5 (4.44)</td>
## <td>15.4 (6.22)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>13.9 [4.64, 28.2]</td>
## <td class='lastrow'>9.52 [3.25, 17.6]</td>
## <td class='lastrow'>19.4 [11.9, 29.6]</td>
## <td class='lastrow'>15.4 [3.25, 29.6]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.244 (0.309)</td>
## <td>0.0830 (0.165)</td>
## <td>0.438 (0.278)</td>
## <td>0.288 (0.303)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0647 [0.000335, 0.960]</td>
## <td class='lastrow'>0.0141 [0.0000989, 0.645]</td>
## <td class='lastrow'>0.436 [0.0119, 0.963]</td>
## <td class='lastrow'>0.182 [0.0000989, 0.963]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: menhinick </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>1.65 (0.695)</td>
## <td>1.15 (0.455)</td>
## <td>1.54 (0.349)</td>
## <td>1.51 (0.563)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>1.63 [0.553, 3.29]</td>
## <td class='lastrow'>1.12 [0.391, 2.06]</td>
## <td class='lastrow'>1.53 [0.939, 2.34]</td>
## <td class='lastrow'>1.44 [0.391, 3.29]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.223 (0.240)</td>
## <td>0.301 (0.342)</td>
## <td>0.438 (0.278)</td>
## <td>0.323 (0.290)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.113 [0.000000258, 0.896]</td>
## <td class='lastrow'>0.119 [0.000408, 0.931]</td>
## <td class='lastrow'>0.436 [0.0119, 0.963]</td>
## <td class='lastrow'>0.271 [0.000000258, 0.963]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: fisher_alpha </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>22.8 (11.7)</td>
## <td>14.6 (6.84)</td>
## <td>30.6 (8.25)</td>
## <td>24.2 (11.2)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>21.9 [6.01, 52.3]</td>
## <td class='lastrow'>13.9 [4.01, 29.2]</td>
## <td class='lastrow'>30.2 [16.9, 50.1]</td>
## <td class='lastrow'>23.7 [4.01, 52.3]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.277 (0.333)</td>
## <td>0.119 (0.220)</td>
## <td>0.440 (0.279)</td>
## <td>0.310 (0.314)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0671 [0.00125, 0.990]</td>
## <td class='lastrow'>0.0261 [0.000520, 0.857]</td>
## <td class='lastrow'>0.440 [0.00883, 0.981]</td>
## <td class='lastrow'>0.178 [0.000520, 0.990]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: faith_pd </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>15.1 (4.82)</td>
## <td>13.0 (3.84)</td>
## <td>22.6 (5.91)</td>
## <td>17.6 (6.50)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>14.3 [7.15, 27.0]</td>
## <td class='lastrow'>12.3 [5.45, 19.8]</td>
## <td class='lastrow'>20.8 [16.4, 42.0]</td>
## <td class='lastrow'>17.7 [5.45, 42.0]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.161 (0.292)</td>
## <td>0.0657 (0.150)</td>
## <td>0.419 (0.310)</td>
## <td>0.243 (0.311)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00769 [0.000000450, 0.944]</td>
## <td class='lastrow'>0.000811 [0.0000000286, 0.558]</td>
## <td class='lastrow'>0.406 [0.00000000000715, 0.998]</td>
## <td class='lastrow'>0.0591 [0.00000000000715, 0.998]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: gini_index </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.996 (0.00200)</td>
## <td>0.997 (0.00156)</td>
## <td>0.980 (0.00416)</td>
## <td>0.990 (0.00847)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.996 [0.991, 0.999]</td>
## <td class='lastrow'>0.997 [0.994, 0.999]</td>
## <td class='lastrow'>0.980 [0.971, 0.988]</td>
## <td class='lastrow'>0.995 [0.971, 0.999]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.000582 (0.00145)</td>
## <td>0.0000744 (0.000129)</td>
## <td>0.489 (0.322)</td>
## <td>0.191 (0.312)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0000756 [0.00000400, 0.00568]</td>
## <td class='lastrow'>0.0000186 [0.00000400, 0.000439]</td>
## <td class='lastrow'>0.516 [0.0156, 0.974]</td>
## <td class='lastrow'>0.000250 [0.00000400, 0.974]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: strong </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.470 (0.0521)</td>
## <td>0.496 (0.0806)</td>
## <td>0.384 (0.0270)</td>
## <td>0.441 (0.0695)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.473 [0.351, 0.615]</td>
## <td class='lastrow'>0.490 [0.355, 0.660]</td>
## <td class='lastrow'>0.386 [0.331, 0.452]</td>
## <td class='lastrow'>0.427 [0.331, 0.660]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.127 (0.217)</td>
## <td>0.116 (0.203)</td>
## <td>0.563 (0.285)</td>
## <td>0.296 (0.323)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0126 [0.00000000105, 0.887]</td>
## <td class='lastrow'>0.00538 [0.00000000000240, 0.644]</td>
## <td class='lastrow'>0.574 [0.0590, 0.971]</td>
## <td class='lastrow'>0.172 [0.00000000000240, 0.971]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: pielou_evenness </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.848 (0.0410)</td>
## <td>0.815 (0.0685)</td>
## <td>0.908 (0.0161)</td>
## <td>0.865 (0.0548)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.857 [0.698, 0.913]</td>
## <td class='lastrow'>0.829 [0.669, 0.911]</td>
## <td class='lastrow'>0.911 [0.878, 0.934]</td>
## <td class='lastrow'>0.880 [0.669, 0.934]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.167 (0.246)</td>
## <td>0.139 (0.265)</td>
## <td>0.526 (0.248)</td>
## <td>0.302 (0.307)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0401 [0.0000000000000107, 0.844]</td>
## <td class='lastrow'>0.00146 [0, 0.914]</td>
## <td class='lastrow'>0.552 [0.129, 0.961]</td>
## <td class='lastrow'>0.266 [0, 0.961]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: shannon_entropy </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>5.78 (0.708)</td>
## <td>5.14 (0.828)</td>
## <td>6.83 (0.307)</td>
## <td>6.07 (0.895)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>5.86 [4.02, 7.04]</td>
## <td class='lastrow'>5.23 [3.92, 6.41]</td>
## <td class='lastrow'>6.81 [6.14, 7.39]</td>
## <td class='lastrow'>6.25 [3.92, 7.39]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.0784 (0.166)</td>
## <td>0.0222 (0.0594)</td>
## <td>0.487 (0.323)</td>
## <td>0.227 (0.310)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00180 [0.00000000000000511, 0.749]</td>
## <td class='lastrow'>0.00000460 [0.00000000000000111, 0.197]</td>
## <td class='lastrow'>0.465 [0.0257, 0.987]</td>
## <td class='lastrow'>0.0447 [0.00000000000000111, 0.987]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: simpson </caption>
##
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.969 (0.0175)</td>
## <td>0.951 (0.0285)</td>
## <td>0.987 (0.00307)</td>
## <td>0.973 (0.0216)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.975 [0.902, 0.990]</td>
## <td class='lastrow'>0.959 [0.892, 0.986]</td>
## <td class='lastrow'>0.988 [0.979, 0.992]</td>
## <td class='lastrow'>0.982 [0.892, 0.992]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.135 (0.219)</td>
## <td>0.0987 (0.237)</td>
## <td>0.539 (0.231)</td>
## <td>0.286 (0.304)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00753 [0, 0.759]</td>
## <td class='lastrow'>0.0000000179 [0, 0.877]</td>
## <td class='lastrow'>0.597 [0.0989, 0.993]</td>
## <td class='lastrow'>0.165 [0, 0.993]</td>
## </tr>
## </tbody>
## </table>
# for (i in 1:length(metric)){
# table1(~ metric + t_probability | condition, data=table_list[[i]], caption = paste("Results for alpha metric:", metric[i], " "))
# }
Compare Hospital’s data to AGP healthy data and CDI dataset from BioProject:
all_data_comparison <- rbind.fill(all_healthy, CDI, hospital_donor, hospital_CDI)
all_data_comparison <- dplyr::select(all_data_comparison, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, gini_index, strong, pielou_evenness, faith_pd, condition)
all_data_comparison$condition[all_data_comparison$condition=="healthy"] <- "healthy_AGP"
all_data_comparison$condition[all_data_comparison$condition=="CDI"] <- "CDI_BioProject"
all_data_comparison$condition[all_data_comparison$condition=="Cdif_post"] <- "hospital_CDI_post"
all_data_comparison$condition[all_data_comparison$condition=="Cdif_pre"] <- "hospital_CDI_pre"
violin_hospital_compare <- vector('list', length(metric))
violin_hospital_compare <- plot_violin(all_data_comparison, "condition")
grid.arrange(violin_hospital_compare[[1]], violin_hospital_compare[[2]], violin_hospital_compare[[3]], violin_hospital_compare[[4]], ncol=2)
grid.arrange(violin_hospital_compare[[5]], violin_hospital_compare[[6]], violin_hospital_compare[[7]], violin_hospital_compare[[8]], ncol=2)
grid.arrange(violin_hospital_compare[[9]], violin_hospital_compare[[10]], ncol=2)
From tutorial: https://uw-madison-microbiome-hub.github.io/Microbiome_analysis_in-_R/ and: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq
#BiocManager::install("phyloseq")
#remotes::install_github("jbisanz/qiime2R")
#BiocManager::install("microbiome")
#remotes::install_github("microsud/microbiomeutilities")
library(phyloseq)
library(qiime2R)
library(microbiome)
##
## microbiome R package (microbiome.github.com)
##
##
##
## Copyright (C) 2011-2021 Leo Lahti,
## Sudarshan Shetty et al. <microbiome.github.io>
##
## Attaching package: 'microbiome'
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:base':
##
## transform
library(microbiomeutilities)
## Warning: replacing previous import 'ggplot2::alpha' by 'microbiome::alpha' when
## loading 'microbiomeutilities'
# Importing ASVs abundance file
ASVs <- read_qza(here("00_raw_data/09_hospital_data_qiime2","table-CDI.qza"))
# Importing metadata
metadata <- read.delim(here("00_raw_data/07_hospital_CDI", "sample-metadata.tsv.gz"))
rownames(metadata) <- metadata[,1]
metadata$time <- as.numeric(metadata$time)
# Importing tree
tree <- read_qza(here("00_raw_data/09_hospital_data_qiime2","rooted-tree-CDI.qza"))
# Importing taxonomy
taxonomy <- read_qza(here("00_raw_data/09_hospital_data_qiime2","taxonomy-CDI.qza"))
taxonomy_table <- parse_taxonomy(taxonomy$data)
tax_table <- do.call(rbind, strsplit(as.character(taxonomy$data$Taxon), "; "))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
colnames(tax_table) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
rownames(tax_table) <- taxonomy$data$Feature.ID
# Creating phyloseq object
physeq <- phyloseq(
otu_table(ASVs$data, taxa_are_rows = TRUE),
phy_tree(tree$data),
tax_table(tax_table),
sample_data(metadata)
)
#Subset the data to keep only Receptor samples.
physeq.rec <- subset_samples(physeq, sample.purpose == "Receptor")
for(i in 1:nsamples(physeq.rec)) {
if(sample_data(physeq.rec)$time[i] <= 0){
sample_data(physeq.rec)$FMT_pre_post[i] <- "pre"
} else {
sample_data(physeq.rec)$FMT_pre_post[i] <- "post"
}
}
sample_data(physeq.rec)$FMT_pre_post <- as.factor(sample_data(physeq.rec)$FMT_pre_post)
sample_data(physeq.rec)$FMT_pre_post <- relevel(sample_data(physeq.rec)$FMT_pre_post, "pre")
levels(sample_data(physeq.rec)$FMT_pre_post)
## [1] "pre" "post"
#BiocManager::install("ALDEx2")
#library(ALDEx2)
#aldex2_da <- ALDEx2::aldex(data.frame(phyloseq::otu_table(physeq.rec)), phyloseq::sample_data(physeq.rec)$FMT_pre_post, test="t", effect = TRUE, denom="iqlr")
## Vulcano plot for differentially expressed OTUs
#ALDEx2::aldex.plot(aldex2_da, type="MW", test="wilcox", called.cex = 1, cutoff = 0.1)
ntaxa(physeq.rec)
## [1] 16722
nsamples(physeq.rec)
## [1] 112
otu_table(physeq.rec)[1:5, 1:5]
## OTU Table: [5 taxa and 5 samples]
## taxa are rows
## CD003-M-1 CD003-M-n CD003-M2 CD003-M4
## 7841644759c6fc1e7b024272b6dcf462 0 0 0 0
## 555cdf14776844cf94aeb170e12170de 0 6 0 0
## fc885b5c51d9581397c9018fe2cc85d3 0 0 0 0
## ee00598097dd659ddea3f91273ef97f1 3 0 0 0
## f2161a1901a6fb0ba3b90ce278930c8b 0 0 0 0
## CD003-M90
## 7841644759c6fc1e7b024272b6dcf462 0
## 555cdf14776844cf94aeb170e12170de 0
## fc885b5c51d9581397c9018fe2cc85d3 0
## ee00598097dd659ddea3f91273ef97f1 0
## f2161a1901a6fb0ba3b90ce278930c8b 0
tax_table(physeq.rec)[1:5, 1:4]
## Taxonomy Table: [5 taxa by 4 taxonomic ranks]:
## Kingdom Phylum Class
## 7841644759c6fc1e7b024272b6dcf462 "k__Bacteria" "k__Bacteria" "k__Bacteria"
## 555cdf14776844cf94aeb170e12170de "k__Bacteria" "p__OD1" "c__"
## fc885b5c51d9581397c9018fe2cc85d3 "k__Bacteria" "p__OD1" "c__"
## ee00598097dd659ddea3f91273ef97f1 "k__Bacteria" "k__Bacteria" "k__Bacteria"
## f2161a1901a6fb0ba3b90ce278930c8b "k__Bacteria" "k__Bacteria" "k__Bacteria"
## Order
## 7841644759c6fc1e7b024272b6dcf462 "k__Bacteria"
## 555cdf14776844cf94aeb170e12170de "o__"
## fc885b5c51d9581397c9018fe2cc85d3 "o__"
## ee00598097dd659ddea3f91273ef97f1 "k__Bacteria"
## f2161a1901a6fb0ba3b90ce278930c8b "k__Bacteria"
# Prune and rarefy samples
# Keep samples with non-zero total taxa
pruned <- prune_samples(sample_sums(physeq.rec) > 0, physeq.rec)
# Check total taxa in each sample again
sample_sums(pruned)
## CD003-M-1 CD003-M-n CD003-M2 CD003-M4 CD003-M90 CD005-M-1
## 23127 66753 11518 15114 7832 691
## CD005-M2 CD005-M7 CD005-M90 CD006-M2 CD006-M7 CD006-M90
## 18367 11422 16151 54268 13651 12670
## CD007-M-1 CD007-M2 CD007-M90 CD008-M-1 CD008-M-n CD008-M2
## 18494 13948 18290 13488 15800 22539
## CD008-M7 CD008-M90 CD010-M-n CD010-M2 CD010-M7 CD011-M-1
## 16727 18092 21150 16548 23071 8754
## CD011-M-n CD011-M2 CD011-M7 CD011-M90 CD012-M-n CD012-M2
## 16169 11663 15454 12021 9787 8619
## CD012-M7 CD012-M90 CD013-M-1 CD013-M-n CD013-M2 CD013-M7
## 12514 15027 9787 19680 17429 18001
## CD014-M-n CD014-M2-a CD014-M2-b CD014-M7 CD015-M-n CD015-M2
## 18408 28218 25480 12155 7475 9797
## CD015-M7 CD015-M90 CD016-M-1 CD016-M-n CD016-M2 CD016-M7
## 32375 21725 13175 14429 11145 12135
## CD017-M-1 CD017-M-n CD017-M2 CD017-M7 CD018-M-1 CD018-M-n-a
## 13110 15580 12787 20847 8630 25783
## CD018-M-n-b CD018-M11 CD019-M-1 CD019-M-n CD019-M2 CD019-M7
## 22231 20305 8715 11447 14326 13832
## CD020-M-n-a CD020-M-n-b CD020-M2 CD020-M7 CD021M-n CD023-M-1
## 15385 1214 23699 9689 17428 4624
## CD023-M-n CD023-M2 CD023-M7 CD025-M-1 CD025-M2 CD026-M-1
## 13184 10007 15449 7030 7579 10045
## CD026-M-n CD026-M2-a CD026-M2-b CD026-M7-a CD026-M7-b CD026-M90
## 6465 13894 17400 10844 12575 15050
## CD028-M-1 CD028-M-n CD028-M2-b CD028-M7 CD028-M90 CD029-M-n
## 7107 16203 16228 14942 19831 12906
## CD029-M2 CD029-M7 CD029-M90 CD030-M-n CD030-M2 CD030-M30-n
## 16682 19111 11080 14222 6816 11712
## CD030-M7 CD031-M2 CD031-M7 CD031M-1 CD033-M-1 CD033-M5
## 10909 22764 20418 19353 4504 17612
## CD033-M7 CD033-M90 CD034-M-1 CD034-M-n CD034-M2 CD034-M7
## 17815 46321 11816 15245 21461 13800
## CD035-M-n CD035-M2 CD035-M7 CD036-M-1 CD036-M-n CD036-M2
## 22023 31698 24964 13609 10941 5731
## CD036-M7 CD036-M90-a CD036-M90-b
## 7356 5593 2767
#rarefy samples
physeq_rarefy <- rarefy_even_depth(pruned, rngseed=1, sample.size=0.9*min(sample_sums(pruned)), replace=F)
## `set.seed(1)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(1); .Random.seed` for the full vector
## ...
## 13799OTUs were removed because they are no longer
## present in any sample after random subsampling
## ...
#Check the taxa prevalence at Phylum level
#plot_taxa_prevalence(physeq_rarefy, "Order")
Barplots are a one way of visualising the composition of your samples. At Family level and relative abundance
# convert to relative abundance
physeq.rec.rel <- microbiome::transform(physeq.rec, "compositional")
physeq.rec.rel2 <- prune_taxa(taxa_sums(physeq.rec.rel) > 0, physeq.rec.rel)
Check for the core ASVs
core.taxa.standard <- core_members(physeq.rec.rel2, detection = 0.001, prevalence = 50/100)
print(core.taxa.standard)
## [1] "4f3d767e0404daed3f01328053b70ae8"
we only see IDs, not very informative. We can get the classification of these as below.
#install.packages('DT')
library(DT)
# Extract the taxonomy table
taxonomy_core <- as.data.frame(tax_table(physeq.rec.rel2))
# Subset this taxonomy table to include only core OTUs
core_taxa_id <- subset(taxonomy_core, rownames(taxonomy_core) %in% core.taxa.standard)
DT::datatable(core_taxa_id)
physeq.fam.rel <- physeq.rec %>%
aggregate_rare(level = "Family", detection = 1/100, prevalence = 75/100) %>%
microbiome::transform(transform = "compositional")
plot_composition(physeq.fam.rel, sample.sort = "FMT_pre_post", x.label = "FMT_pre_post") +
theme(legend.position = "bottom") +
scale_fill_brewer("Family", palette = "Paired") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
ggtitle("Relative abundance") +
theme(legend.title = element_text(size = 18))
library(RColorBrewer)
physeq.fam.rel <- physeq.rec %>%
aggregate_rare(level = "Family", detection = 1/100, prevalence = 50/100) %>%
microbiome::transform(transform = "compositional")
colourCount = ntaxa(tax_table(physeq.fam.rel))
getPalette = colorRampPalette(brewer.pal(13, "Paired"))
## Warning in brewer.pal(13, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
plot_composition(physeq.fam.rel,
average_by = "FMT_pre_post",
transform = "compositional") +
scale_fill_manual(values = getPalette(colourCount)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
ggtitle("Relative abundance") +
theme(legend.title = element_text(size = 18))
phyloseq::plot_bar(physeq.fam.rel, fill = "Family") +
geom_bar(aes(fill = Family), stat = "identity", position = "stack") +
labs(x = "", y = "Relative Abundance\n") +
facet_wrap(~ FMT_pre_post, scales = "free", nrow =2) +
theme(panel.background = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values = getPalette(colourCount))
# install.packages(
# "microViz",
# repos = c(davidbarnett = "https://david-barnett.r-universe.dev", getOption("repos"))
# )
#library(microViz)
#
# physeq.rec <- physeq.rec %>%
# tax_fix(
# min_length = 4,
# unknowns = c("k__Bacteria", "p__Firmicutes", "p__Proteobacteria", "p__OD1 Phylum", "p__OD1"),
# sep = " ", anon_unique = TRUE,
# suffix_rank = "classified"
# )
#
#
# plot_family <- comp_barplot(
# physeq.rec.rel2,
# "Family",
# n_taxa = 4,
# tax_order = sum,
# merge_other = TRUE,
# sample_order = "bray",
# order_with_all_taxa = FALSE,
# label = "SAMPLE",
# facet_by = "condition",
# bar_width = 1,
# bar_outline_colour = NA,
# #palette = distinct_palette(n_taxa),
# tax_transform_for_ordering = "identity",
# tax_transform_for_plot = "compositional",
# #seriate_method = "OLO_ward",
# keep_all_vars = TRUE,
# #interactive = FALSE,
# x = "SAMPLE"
# )
#
# plot_family
mycols <- c("coral", "steelblue2")
physeq.fam <- aggregate_taxa(physeq.rec, "Family")
top_f <- top_taxa(physeq.fam, 9)
top_f
## [1] "f__Enterobacteriaceae" "f__Lachnospiraceae" "f__Verrucomicrobiaceae"
## [4] "f__Ruminococcaceae" "f__Veillonellaceae" "f__Bacteroidaceae"
## [7] "f__Enterococcaceae" "f__Lactobacillaceae" "f__Streptococcaceae"
top_fams <- plot_listed_taxa(physeq.fam, top_f,
group= "FMT_pre_post",
group.colors = mycols,
add.violin = F,
dot.opacity = 0.25,
box.opacity = 0.25,
panel.arrange= "wrap")
## An additonal column Sam_rep with sample names is created for reference purpose
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the microbiomeutilities package.
## Please report the issue at
## <https://github.com/microsud/microbiomeutilities/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
top_fams
dom.tax <- dominant_taxa(physeq.rec,level = "Family", group="FMT_pre_post")
dom.tax_pre <- dom.tax$dominant_overview %>% filter(FMT_pre_post == "pre")
dom.tax_post <- dom.tax$dominant_overview %>% filter(FMT_pre_post == "post")
head(dom.tax$dominant_overview)
## # A tibble: 6 × 5
## # Groups: FMT_pre_post [2]
## FMT_pre_post dominant_taxa n rel.freq rel.freq.pct
## <fct> <chr> <int> <dbl> <chr>
## 1 post f__Enterobacteriaceae 13 18.6 19%
## 2 post f__Lachnospiraceae 13 18.6 19%
## 3 post f__Verrucomicrobiaceae 11 15.7 16%
## 4 post f__Ruminococcaceae 10 14.3 14%
## 5 post f__Bacteroidaceae 9 12.9 13%
## 6 pre f__Verrucomicrobiaceae 7 16.7 17%
head(dom.tax_pre, 10) %>%
flextable()
FMT_pre_post | dominant_taxa | n | rel.freq | rel.freq.pct |
|---|---|---|---|---|
pre | f__Verrucomicrobiaceae | 7 | 16.7 | 17% |
pre | f__Enterobacteriaceae | 6 | 14.3 | 14% |
pre | f__Enterococcaceae | 6 | 14.3 | 14% |
pre | f__Veillonellaceae | 6 | 14.3 | 14% |
pre | f__Bacteroidaceae | 5 | 11.9 | 12% |
pre | f__Lactobacillaceae | 5 | 11.9 | 12% |
pre | f__Ruminococcaceae | 3 | 7.1 | 7% |
pre | f__Lachnospiraceae | 2 | 4.8 | 5% |
pre | f__Fusobacteriaceae | 1 | 2.4 | 2% |
pre | f__Porphyromonadaceae | 1 | 2.4 | 2% |
head(dom.tax_post, 10)%>%
flextable()
FMT_pre_post | dominant_taxa | n | rel.freq | rel.freq.pct |
|---|---|---|---|---|
post | f__Enterobacteriaceae | 13 | 18.6 | 19% |
post | f__Lachnospiraceae | 13 | 18.6 | 19% |
post | f__Verrucomicrobiaceae | 11 | 15.7 | 16% |
post | f__Ruminococcaceae | 10 | 14.3 | 14% |
post | f__Bacteroidaceae | 9 | 12.9 | 13% |
post | f__Enterococcaceae | 4 | 5.7 | 6% |
post | f__Lactobacillaceae | 4 | 5.7 | 6% |
post | f__Veillonellaceae | 2 | 2.9 | 3% |
post | f__Bifidobacteriaceae | 1 | 1.4 | 1% |
post | f__Desulfovibrionaceae | 1 | 1.4 | 1% |
grp_abund <- get_group_abundances(physeq.rec,
level = "Family",
group="FMT_pre_post",
transform = "compositional")
## An additonal column Sam_rep with sample names is created for reference purpose
# clean names
grp_abund$OTUID <- gsub("f__", "",grp_abund$OTUID)
grp_abund$OTUID <- ifelse(grp_abund$OTUID == "",
"Unclassified", grp_abund$OTUID)
grp_abund_sorted <- grp_abund[order(grp_abund$mean_abundance, decreasing=TRUE),]
mean.plot_sorted <- grp_abund_sorted[1:40,] %>% # input data
ggplot(aes(x= reorder(OTUID, mean_abundance), # reroder based on mean abundance
y= mean_abundance,
fill=FMT_pre_post)) +
geom_bar(stat = "identity",
position = position_dodge()) +
scale_fill_manual("FMT_pre_post", values=mycols) + # manually specify colors
theme_bw() +
ylab("Mean Relative Abundance") +
xlab("Family") +
coord_flip()
# +
# scale_fill_manual(values = getPalette(3))
mean.plot_sorted
Tree plot
physeq_top_50 <- subset_taxa(physeq.rec, Kingdom=="k__Bacteria")
physeq_top_50 <- prune_taxa(names(sort(taxa_sums(physeq_top_50),TRUE)[1:50]), physeq_top_50)
# Color the nodes by category
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="FMT_pre_post")
# Convert to radial tree
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="FMT_pre_post") + coord_polar(theta="y")
Lefse to test for differential abundance between categories
from: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq
# Install microbiome marker package
#remotes::install_version('ggplot2', version='3.3.6')
#BiocManager::install("ggtree")
#BiocManager::install("microbiomeMarker")
# activate microbiome marker package
library(microbiomeMarker)
## Registered S3 method overwritten by 'ggtree':
## method from
## identify.gg ggfun
##
## Attaching package: 'microbiomeMarker'
## The following objects are masked from 'package:microbiome':
##
## abundances, aggregate_taxa
## The following object is masked from 'package:qiime2R':
##
## summarize_taxa
## The following object is masked from 'package:phyloseq':
##
## plot_heatmap
# Differential abundance using lefse
lefse <- run_lefse(physeq.rec,
group = "FMT_pre_post",
taxa_rank = "Family",
wilcoxon_cutoff = 0.05,
norm = "CPM",
kw_cutoff = 0.05,
multigrp_strat = TRUE,
lda_cutoff = 2
)
## Warning in preprocess_ps(ps): The library size of sample(s): CD028-M2-a is/are
## zero, and will be removed in the subsequent analysis.
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
#head(marker_table(lefse))
# bar plot
plot_ef_bar(lefse)
# dot plot
plot_ef_dot(lefse)
library(knitr)
dat <- marker_table(lefse) %>% data.frame() %>% select(1:4)
dat %>% flextable()
feature | enrich_group | ef_lda | pvalue |
|---|---|---|---|
f__Peptostreptococcaceae | pre | 3.751640 | 0.000065467363 |
o__Clostridiales_k__Bacteria | pre | 3.493069 | 0.012386655594 |
f__[Paraprevotellaceae] | pre | 2.852463 | 0.025074627500 |
f__Leptotrichiaceae | pre | 2.693686 | 0.016870489180 |
f__Ruminococcaceae | post | 4.767258 | 0.001687499285 |
f__Lachnospiraceae | post | 4.611532 | 0.011256072336 |
f__Rikenellaceae | post | 4.544018 | 0.000002408661 |
f__Bifidobacteriaceae | post | 4.104626 | 0.007168055327 |
f__Erysipelotrichaceae | post | 4.079664 | 0.024810140922 |
f__[Barnesiellaceae] | post | 3.808157 | 0.010962494793 |
o__Clostridiales_f__ | post | 3.778734 | 0.008928572756 |
f__Clostridiaceae | post | 3.738037 | 0.008496590045 |
f__[Odoribacteraceae] | post | 3.585347 | 0.004615594153 |
f__Christensenellaceae | post | 3.174403 | 0.002367646033 |
f__[Mogibacteriaceae] | post | 3.107154 | 0.000823396201 |
How to transform data in SE format: https://microbiome.github.io/OMA/data-introduction.html
# #BiocManager::install("lefser")
# library(lefser)
# #BiocManager::install("microbiome/mia", version='3.14')
# library(mia)
#
# counts <- ASVs$data # Abundance table (e.g. ASV data; to assay data)
# tax <- tax_table # Taxonomy table (to rowData)
# samples <- metadata # Sample data (to colData)
# se <- SummarizedExperiment(assays = list(counts = counts),
# colData = samples,
# rowData = tax)
#
# # Goes through the whole DataFrame. Removes '.*[kpcofg]__' from strings, where [kpcofg]
# # is any character from listed ones, and .* any character.
# rowdata_modified <- BiocParallel::bplapply(rowData(se),
# FUN = stringr::str_remove,
# pattern = '.*[kpcofg]__')
#
# # Genus level has additional '\"', so let's delete that also
# rowdata_modified <- BiocParallel::bplapply(rowdata_modified,
# FUN = stringr::str_remove,
# pattern = '\"')
#
# # rowdata_modified is a list, so it is converted back to DataFrame format.
# rowdata_modified <- DataFrame(rowdata_modified)
#
# # And then assigned back to the SE object
# rowData(se) <- rowdata_modified
#
# # Now we have a nicer table
# head(rowData(se))
#
# head(colData(se))
#
# #add relative abundances along the original count data
# se <- relAbundanceCounts(se)
# assays(se)
#
# assays(se)$counts[1:5,1:7]
# assay(se, "relabundance")[1:5,1:7]
#
# dim(se)
#
# #add phylogenetic tree
# tse <- as(se, "TreeSummarizedExperiment")
# # Add tree to rowTree
# rowTree(tse) <- tree$data
# # Check
# tse
#
# se <- tse
#
# #head(rowTree(tse))
# rowLinks(tse)
#
# # subset by sample purpose
# se_subset_by_sample <- se[ , se$sample.purpose %in% c("Receptor")]
#
# # show dimensions
# dim(se_subset_by_sample)
#
# head(colData(se_subset_by_sample))
#
# # add metadata column
#
# for(i in 1:dim(se_subset_by_sample)[2]) {
# if(se_subset_by_sample$time[i] <= 0){
# se_subset_by_sample$FMT_pre_post[i] <- "pre"
# } else {
# se_subset_by_sample$FMT_pre_post[i] <- "post"
# }
# }
#
#
# # subset by sample and feature and get rid of NAs
# se_subset_by_sample_feature <- se[rowData(se)$Phylum %in% c("Bacteria") & !is.na(rowData(se)$Phylum), se$sample.purpose %in% c("Receptor")]
#
# # show dimensions
# dim(se_subset_by_sample_feature)
# table(se_subset_by_sample$FMT_pre_post)
# res <- lefser(se_subset_by_sample, groupCol = "FMT_pre_post")
# head(res)
# lefserPlot(res)
# Importing ASVs abundance file
ASVs_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","merged-table.qza"))
# Importing metadata
metadata_CDI <- read.delim(here("00_raw_data/07_hospital_CDI", "sample-metadata.tsv.gz"))
metadata_CDI$time <- as.numeric(metadata_CDI$time)
names(metadata_CDI)[names(metadata_CDI) == 'Sample.ID'] <- 'SampleID'
names(metadata_CDI)[names(metadata_CDI) == 'sample.origin'] <- 'condition'
for(i in 1:nrow(metadata_CDI)) {
if(metadata_CDI$time[i] <= 0 & !is.na(metadata_CDI$time[i])){
metadata_CDI$condition[i] <- paste(metadata_CDI$condition[i], "pre", sep="_")
} else if (metadata_CDI$time[i] > 0 & !is.na(metadata_CDI$time[i])) {
metadata_CDI$condition[i] <- paste(metadata_CDI$condition[i], "post", sep="_")
}
}
metadata_donor <- read.delim(here("00_raw_data/08_hospital_donor","sample-metadata.tsv.gz"))
metadata_donor$condition <- "healthy_donors"
names(metadata_donor)[names(metadata_donor) == 'sample_id'] <- 'SampleID'
metadata_merged <- rbind.fill(metadata_CDI, metadata_donor)
metadata_merged <- dplyr::select(metadata_merged, SampleID, condition)
rownames(metadata_merged) <- metadata_merged[,1]
# Importing tree
tree_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","rooted-tree-merged.qza"))
# Importing taxonomy
taxonomy_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","taxonomy-merged.qza"))
taxonomy_table_merged <- parse_taxonomy(taxonomy_merged$data)
tax_table_merged <- do.call(rbind, strsplit(as.character(taxonomy_merged$data$Taxon), "; "))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
colnames(tax_table_merged) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
rownames(tax_table_merged) <- taxonomy_merged$data$Feature.ID
# Creating phyloseq object
physeq_merged <- phyloseq(
otu_table(ASVs_merged$data, taxa_are_rows = TRUE),
phy_tree(tree_merged$data),
tax_table(tax_table_merged),
sample_data(metadata_merged)
)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
#Subset the data to keep only Receptor samples.
physeq_merged <- subset_samples(physeq_merged, condition == c("Cdif_pre", "Cdif_post", "healthy_donors"))
## Warning in condition == c("Cdif_pre", "Cdif_post", "healthy_donors"): longer
## object length is not a multiple of shorter object length
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
sample_data(physeq_merged)$condition <- as.factor(sample_data(physeq_merged)$condition)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
sample_data(physeq_merged)$condition <- relevel(sample_data(physeq_merged)$condition, "Cdif_pre")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
levels(sample_data(physeq_merged)$condition)
## [1] "Cdif_pre" "Cdif_post" "healthy_donors"
Barplots :
# convert to relative abundance
physeq.rel <- microbiome::transform(physeq_merged, "compositional")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq.rel2 <- prune_taxa(taxa_sums(physeq.rel) > 0, physeq.rel)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq.fam.rel <- physeq_merged %>%
aggregate_rare(level = "Family", detection = 1/100, prevalence = 75/100) %>%
microbiome::transform(transform = "compositional")
plot_composition(physeq.fam.rel, sample.sort = "condition", x.label = "condition") +
theme(legend.position = "bottom") +
scale_fill_brewer("Family", palette = "Paired") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
ggtitle("Relative abundance") +
theme(legend.title = element_text(size = 18))
library(RColorBrewer)
physeq.fam.rel <- physeq_merged %>%
aggregate_rare(level = "Family", detection = 1/100, prevalence = 50/100) %>%
microbiome::transform(transform = "compositional")
colourCount = ntaxa(tax_table(physeq.fam.rel))
getPalette = colorRampPalette(brewer.pal(13, "Paired"))
## Warning in brewer.pal(13, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
plot_composition(physeq.fam.rel,
average_by = "condition",
transform = "compositional") +
scale_fill_manual(values = getPalette(colourCount)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
ggtitle("Relative abundance") +
theme(legend.title = element_text(size = 18))
mycols <- c("coral", "steelblue2", "darkolivegreen2")
physeq.fam <- aggregate_taxa(physeq_merged, "Family")
sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "Cdif_pre", "Pre"))
sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "Cdif_post", "Post"))
sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "healthy_donors", "Donors"))
sample_data(physeq.fam)$condition <- relevel(sample_data(physeq.fam)$condition, ref = "Post")
sample_data(physeq.fam)$condition <- relevel(sample_data(physeq.fam)$condition, ref = "Pre")
top_f <- top_taxa(physeq.fam, 9)
top_f
## [1] "f__Bacteroidaceae" "f__Ruminococcaceae" "f__Lachnospiraceae"
## [4] "f__Veillonellaceae" "f__Enterobacteriaceae" "f__Rikenellaceae"
## [7] "f__Verrucomicrobiaceae" "f__Porphyromonadaceae" "f__Prevotellaceae"
top_fams <- plot_listed_taxa(physeq.fam,
top_f,
group= "condition",
group.colors = mycols,
add.violin = F,
dot.opacity = 0.25,
box.opacity = 0.25,
panel.arrange= "wrap")
## An additonal column Sam_rep with sample names is created for reference purpose
top_fams
dom.tax <- dominant_taxa(physeq_merged, level = "Family", group="condition")
dom.tax_pre <- dom.tax$dominant_overview %>% filter(condition == "Cdif_pre")
dom.tax_post <- dom.tax$dominant_overview %>% filter(condition == "Cdif_post")
dom.tax_donor <- dom.tax$dominant_overview %>% filter(condition == "healthy_donors")
head(dom.tax_pre) %>%
flextable()
condition | dominant_taxa | n | rel.freq | rel.freq.pct |
|---|---|---|---|---|
Cdif_pre | f__Bacteroidaceae | 3 | 17.6 | 18% |
Cdif_pre | f__Enterococcaceae | 3 | 17.6 | 18% |
Cdif_pre | f__Veillonellaceae | 3 | 17.6 | 18% |
Cdif_pre | f__Enterobacteriaceae | 2 | 11.8 | 12% |
Cdif_pre | f__Ruminococcaceae | 2 | 11.8 | 12% |
Cdif_pre | f__Verrucomicrobiaceae | 2 | 11.8 | 12% |
head(dom.tax_post)%>%
flextable()
condition | dominant_taxa | n | rel.freq | rel.freq.pct |
|---|---|---|---|---|
Cdif_post | f__Enterobacteriaceae | 5 | 21.7 | 22% |
Cdif_post | f__Lachnospiraceae | 5 | 21.7 | 22% |
Cdif_post | f__Bacteroidaceae | 3 | 13.0 | 13% |
Cdif_post | f__Ruminococcaceae | 3 | 13.0 | 13% |
Cdif_post | f__Verrucomicrobiaceae | 3 | 13.0 | 13% |
Cdif_post | f__Lactobacillaceae | 2 | 8.7 | 9% |
head(dom.tax_donor)%>%
flextable()
condition | dominant_taxa | n | rel.freq | rel.freq.pct |
|---|---|---|---|---|
healthy_donors | f__Bacteroidaceae | 28 | 71.8 | 72% |
healthy_donors | f__Ruminococcaceae | 5 | 12.8 | 13% |
healthy_donors | f__Lachnospiraceae | 3 | 7.7 | 8% |
healthy_donors | f__Prevotellaceae | 3 | 7.7 | 8% |
grp_abund <- get_group_abundances(physeq_merged,
level = "Family",
group="condition",
transform = "compositional")
## An additonal column Sam_rep with sample names is created for reference purpose
# clean names
grp_abund$OTUID <- gsub("f__", "",grp_abund$OTUID)
grp_abund$OTUID <- ifelse(grp_abund$OTUID == "",
"Unclassified", grp_abund$OTUID)
grp_abund_sorted <- grp_abund[order(grp_abund$mean_abundance, decreasing=TRUE),]
mean.plot_sorted <- grp_abund_sorted[1:40,] %>% # input data
ggplot(aes(x= reorder(OTUID, mean_abundance), # reroder based on mean abundance
y= mean_abundance,
fill=condition)) +
geom_bar(stat = "identity",
position = position_dodge()) +
scale_fill_manual("condition", values=mycols) + # manually specify colors
theme_bw() +
ylab("Mean Relative Abundance") +
xlab("Family") +
coord_flip()
mean.plot_sorted
Tree plot
physeq_top_50 <- subset_taxa(physeq_merged, Kingdom=="k__Bacteria")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq_top_50 <- prune_taxa(names(sort(taxa_sums(physeq_top_50),TRUE)[1:50]), physeq_top_50)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
# Color the nodes by category
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="condition")
# Convert to radial tree
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="condition") + coord_polar(theta="y")
Lefse to test for differential abundance between categories
from: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq
# Differential abundance using lefse
lefse <- run_lefse(physeq_merged,
group = "condition",
taxa_rank = "Family",
wilcoxon_cutoff = 0.05,
norm = "CPM",
kw_cutoff = 0.05,
multigrp_strat = TRUE,
lda_cutoff = 2
)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
# bar plot
plot_ef_bar(lefse)
# dot plot
plot_ef_dot(lefse)
library(knitr)
dat <- marker_table(lefse) %>% data.frame() %>% select(1:4)
dat %>% flextable()
feature | enrich_group | ef_lda | pvalue |
|---|---|---|---|
f__Enterobacteriaceae | Cdif_pre | 5.019863 | 0.00000722865924779 |
f__Enterococcaceae | Cdif_pre | 4.888719 | 0.00000041798678037 |
f__Lactobacillaceae | Cdif_pre | 4.768221 | 0.00000000003656169 |
f__Peptostreptococcaceae | Cdif_pre | 3.882746 | 0.00001126451367478 |
k__Bacteria_k__Bacteria_k__Bacteria_k__Bacteria | Cdif_pre | 3.382887 | 0.00002234050035835 |
f__Carnobacteriaceae | Cdif_pre | 3.355318 | 0.00022394112443342 |
f__Actinomycetaceae | Cdif_pre | 3.078969 | 0.00210243838091560 |
f__[Tissierellaceae] | Cdif_pre | 3.058240 | 0.01088136973394066 |
f__Pasteurellaceae | Cdif_pre | 3.038437 | 0.01423156416571151 |
f__Leptotrichiaceae | Cdif_pre | 2.613028 | 0.02013020468359838 |
f__Gemellaceae | Cdif_pre | 2.582525 | 0.00342559972775908 |
f__Corynebacteriaceae | Cdif_pre | 2.234964 | 0.02487689077094919 |
f__Micrococcaceae | Cdif_pre | 2.125729 | 0.00365354080850151 |
f__Neisseriaceae | Cdif_pre | 2.083472 | 0.02487689077094919 |
o__Lactobacillales_k__Bacteria | Cdif_pre | 2.044632 | 0.01327155137579679 |
f__Streptococcaceae | Cdif_post | 4.727100 | 0.00000023814153347 |
f__Fusobacteriaceae | Cdif_post | 4.043192 | 0.00033522810850423 |
f__Clostridiaceae | Cdif_post | 3.735624 | 0.00125214914383243 |
o__Clostridiales_k__Bacteria | Cdif_post | 3.576153 | 0.03194429025549037 |
f__Leuconostocaceae | Cdif_post | 2.928322 | 0.00246241049233072 |
f__[Mogibacteriaceae] | Cdif_post | 2.846803 | 0.01080665000498568 |
f__Bacteroidaceae | healthy_donors | 5.223932 | 0.00000000262051031 |
f__Ruminococcaceae | healthy_donors | 5.003683 | 0.00000010823078621 |
f__Lachnospiraceae | healthy_donors | 4.827433 | 0.00129259456138810 |
f__Rikenellaceae | healthy_donors | 4.389663 | 0.00001242198127422 |
f__Porphyromonadaceae | healthy_donors | 4.312682 | 0.00000000108484095 |
f__[Barnesiellaceae] | healthy_donors | 4.028017 | 0.00000000039221370 |
f__[Paraprevotellaceae] | healthy_donors | 3.977319 | 0.00001201496291125 |
o__Clostridiales_f__ | healthy_donors | 3.955847 | 0.00099584336046843 |
f__[Odoribacteraceae] | healthy_donors | 3.809392 | 0.00000044752653400 |
f__Christensenellaceae | healthy_donors | 3.527962 | 0.00102425156499283 |
f__S24-7 | healthy_donors | 3.364287 | 0.00314413219170438 |
o__RF32_f__ | healthy_donors | 3.237709 | 0.00114169956828998 |
o__YS2_f__ | healthy_donors | 3.148024 | 0.03435315417650129 |
o__RF39_f__ | healthy_donors | 2.331961 | 0.00598693092425966 |
f__Oxalobacteraceae | healthy_donors | 2.196756 | 0.01885911458068856 |
f__Peptococcaceae | healthy_donors | 2.102568 | 0.02069257446309545 |